library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyverse)
Warning: package ‘tidyverse’ was built under R version 4.2.3Warning: package ‘ggplot2’ was built under R version 4.2.3Warning: package ‘tidyr’ was built under R version 4.2.3Warning: package ‘readr’ was built under R version 4.2.3Warning: package ‘purrr’ was built under R version 4.2.3Warning: package ‘forcats’ was built under R version 4.2.3Warning: package ‘lubridate’ was built under R version 4.2.3── Attaching core tidyverse packages ─────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.4
✔ ggplot2 3.4.3 ✔ stringr 1.5.0
✔ lubridate 1.9.2 ✔ tibble 3.1.8
✔ purrr 1.0.2 ✔ tidyr 1.3.0── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tidyr)
library(plotly)
Warning: package ‘plotly’ was built under R version 4.2.3Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(ggplot2)
library(coefplot)
Warning: package ‘coefplot’ was built under R version 4.2.3
library(corrplot)
Warning: package ‘corrplot’ was built under R version 4.2.3corrplot 0.92 loaded
library(plotly)
library(caTools)
Warning: package ‘caTools’ was built under R version 4.2.3
library(Metrics)
Warning: package ‘Metrics’ was built under R version 4.2.3
# Load the two raw student data sets; the relative paths are resolved
# against the current working directory.
mathData <- read.csv(file = "studentMat.csv")
porData <- read.csv(file = "studentPor.csv")
#EDA
1. school - student’s school (binary: ‘GP’ - Gabriel Pereira or ‘MS’ - Mousinho da Silveira)
2. sex - student’s sex (binary: ‘F’ - female or ‘M’ - male)
3. age - student’s age (numeric: from 15 to 22)
4. address - student’s home address type (binary: ‘U’ - urban or ‘R’ - rural)
5. famsize - family size (binary: ‘LE3’ - less or equal to 3 or ‘GT3’ - greater than 3)
6. Pstatus - parent’s cohabitation status (binary: ‘T’ - living together or ‘A’ - apart)
7. Medu - mother’s education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8. Fedu - father’s education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9. Mjob - mother’s job (nominal: ‘teacher’, ‘health’ care related, civil ‘services’ (e.g. administrative or police), ‘at_home’ or ‘other’)
10. Fjob - father’s job (nominal: ‘teacher’, ‘health’ care related, civil ‘services’ (e.g. administrative or police), ‘at_home’ or ‘other’)
11. reason - reason to choose this school (nominal: close to ‘home’, school ‘reputation’, ‘course’ preference or ‘other’)
12. guardian - student’s guardian (nominal: ‘mother’, ‘father’ or ‘other’)
13. traveltime - home to school travel time (numeric: 1 - less than 15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - more than 1 hour)
14. studytime - weekly study time (numeric: 1 - less than 2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - more than 10 hours)
15. failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16. schoolsup - extra educational support (binary: yes or no)
17. famsup - family educational support (binary: yes or no)
18. paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19. activities - extra-curricular activities (binary: yes or no)
20. nursery - attended nursery school (binary: yes or no)
21. higher - wants to take higher education (binary: yes or no)
22. internet - Internet access at home (binary: yes or no)
23. romantic - with a romantic relationship (binary: yes or no)
24. famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25. freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26. goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27. Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28. Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29. health - current health status (numeric: from 1 - very bad to 5 - very good)
30. absences - number of school absences (numeric: from 0 to 93)
# Auto-print both raw data frames for a first visual inspection.
mathData
porData
# NOTE(review): the bare NA below only prints `[1] NA`; it looks like
# notebook residue - confirm before removing.
NA
# Exploratory plots ----
# Every feature is drawn twice with identical settings: first for the
# Portuguese data (red bars), then for the Math data (blue bars).

por_fill <- "#FF6666"
math_fill <- "skyblue"

# Build a bar chart of value counts for one column, with the count printed
# above each bar.
#
# data     : data frame that holds the column.
# column   : name of the column to count, as a string.
# title    : plot title.
# x_label  : x-axis label (the y-axis is always labelled "Count").
# fill     : fill colour of the bars.
# discrete : when TRUE the column is converted to a factor before counting,
#            so numeric codes get one bar (and one axis tick) per value.
#
# Returns the ggplot object; it is not printed here.
plot_count_bar <- function(data, column, title, x_label, fill,
                           discrete = FALSE) {
  x_mapping <- if (discrete) {
    aes(x = as.factor(.data[[column]]))
  } else {
    aes(x = .data[[column]])
  }

  ggplot(data, x_mapping) +
    geom_bar(fill = fill) +
    labs(title = title, x = x_label, y = "Count") +
    # after_stat(count) replaces the `..count..` notation, which is
    # deprecated since ggplot2 3.4.0.
    stat_count(
      geom = "text",
      aes(label = after_stat(count)),
      vjust = -0.5
    ) +
    theme_minimal()
}

# Draw the same count chart for both data sets, Portuguese first.
#
# por / math default to the script-level data frames so the calls below stay
# short. Plots are printed explicitly because auto-printing does not happen
# inside a function.
plot_count_pair <- function(column, title, x_label, discrete = FALSE,
                            por = porData, math = mathData) {
  print(plot_count_bar(por, column, title, x_label, por_fill, discrete))
  print(plot_count_bar(math, column, title, x_label, math_fill, discrete))
  invisible(NULL)
}

plot_count_pair("school", " Which school are they", "School")
plot_count_pair("sex", "Count of Males and Females ", "Sex")
plot_count_pair("age", "Count of Students by Age", "Age", discrete = TRUE)

# Age distribution as horizontal box plots (base graphics).
boxplot(
  porData$age,
  horizontal = TRUE,
  main = "Horizontal Box Plot of Data",
  xlab = "Value",
  col = por_fill
)
boxplot(
  mathData$age,
  horizontal = TRUE,
  main = "Horizontal Box Plot of Data",
  xlab = "Value",
  col = math_fill
)

plot_count_pair("address", " By where does student live", "Area")
plot_count_pair("famsize", " By family size ", "No of people")
plot_count_pair("Pstatus", " Parents living status ", "Status")
plot_count_pair("Medu", "Mother's Education", "Education", discrete = TRUE)
plot_count_pair("Fedu", "Father's Education", "Education", discrete = TRUE)
plot_count_pair("Mjob", "Mother's Job", "Job")
plot_count_pair("Fjob", "Father's Job", "Job")
plot_count_pair("reason", "Reason to select the school", "Reason")
plot_count_pair("guardian", "Guardians", "Gaurdian")
plot_count_pair("traveltime", "Traveltime for schools by hour", "Traveltime")
plot_count_pair("studytime", "Study preparation by hours", "studytime")
plot_count_pair("failures", "Failures in past", "Failure")
plot_count_pair("schoolsup", " School support ", "support")
plot_count_pair("famsup", " family support ", "support")
plot_count_pair(
  "paid", " Are they gng for extra paid classes ", "Paid classes"
)
plot_count_pair("activities", " Extracurricular activities", "Extra")
plot_count_pair("nursery", " Attended nursery school or not ", "Nursery")
plot_count_pair(
  "higher", " Are they planning on taking higher studies", "Higher studies"
)
plot_count_pair("internet", "Internet acess at home", "Internet")
plot_count_pair(
  "romantic", " Are they in romantic relationship ", "Relationship"
)
plot_count_pair("famrel", " Family Relationship ", "Famrel")
plot_count_pair("freetime", " How much freetime ", "Free time")
plot_count_pair(
  "goout", " Are they gng out with frnds ", "gng out with frnds"
)
plot_count_pair("Dalc", " Workday alc consump ", "Drinking alc")
plot_count_pair("Walc", " Weekend alc consump ", "Drinking alc")
plot_count_pair("health", " Health Status ", "Health")
plot_count_pair("absences", " Absent days ", "No of days")
Separating Numeric and Non numeric Data
# Names of the porData columns that are not numeric.
# vapply() guarantees exactly one logical per column; sapply() would return
# an empty list for a zero-column data frame and break the subsetting.
non_numeric_columns <- names(porData)[
  !vapply(porData, is.numeric, logical(1))
]
cat("These are non-numeric columns:", non_numeric_columns, "\n\n")
These are non-numeric columns: school sex address famsize Pstatus Mjob Fjob reason guardian schoolsup famsup paid activities nursery higher internet romantic
# Names of the porData columns that are numeric (integer or double).
# vapply() guarantees exactly one logical per column; sapply() would return
# an empty list for a zero-column data frame and break the subsetting.
numeric_columns <- names(porData)[vapply(porData, is.numeric, logical(1))]
cat("These are numeric columns:", numeric_columns, "\n")
These are numeric columns: age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3
Checking all the unique values that a non Numeric feature has
# Non-numeric porData columns; `non_numeric` is reused further down when the
# columns are split into binary and multi-valued features.
non_numeric <- names(porData)[!vapply(porData, is.numeric, logical(1))]

# Report, per column, how many distinct values it holds and what they are.
for (col in non_numeric) {
  distinct_values <- unique(porData[[col]])
  print(paste(
    "The column ", col, "has ", length(distinct_values),
    " unique values they are: "
  ))
  print(distinct_values)
  cat("\n")
}
[1] "The column school has 2 unique values they are: "
[1] "GP" "MS"
[1] "The column sex has 2 unique values they are: "
[1] "F" "M"
[1] "The column address has 2 unique values they are: "
[1] "U" "R"
[1] "The column famsize has 2 unique values they are: "
[1] "GT3" "LE3"
[1] "The column Pstatus has 2 unique values they are: "
[1] "A" "T"
[1] "The column Mjob has 5 unique values they are: "
[1] "at_home" "health" "other" "services" "teacher"
[1] "The column Fjob has 5 unique values they are: "
[1] "teacher" "other" "services" "health" "at_home"
[1] "The column reason has 4 unique values they are: "
[1] "course" "other" "home" "reputation"
[1] "The column guardian has 3 unique values they are: "
[1] "mother" "father" "other"
[1] "The column schoolsup has 2 unique values they are: "
[1] "yes" "no"
[1] "The column famsup has 2 unique values they are: "
[1] "no" "yes"
[1] "The column paid has 2 unique values they are: "
[1] "no" "yes"
[1] "The column activities has 2 unique values they are: "
[1] "no" "yes"
[1] "The column nursery has 2 unique values they are: "
[1] "yes" "no"
[1] "The column higher has 2 unique values they are: "
[1] "yes" "no"
[1] "The column internet has 2 unique values they are: "
[1] "no" "yes"
[1] "The column romantic has 2 unique values they are: "
[1] "no" "yes"
Separating Binary & Multiple unique values within features
# Split the non-numeric features by their number of distinct values in
# porData: exactly two -> binary, more than two -> multi-valued.
# The counts are computed in one pass instead of growing two vectors inside
# a loop.
unique_counts <- vapply(
  non_numeric,
  function(col) length(unique(porData[[col]])),
  integer(1)
)
binary_columns <- non_numeric[unique_counts == 2]
multi_unique_columns <- non_numeric[unique_counts > 2]

print(paste("Binary columns:", paste(binary_columns, collapse = ", ")))
[1] "Binary columns: school, sex, address, famsize, Pstatus, schoolsup, famsup, paid, activities, nursery, higher, internet, romantic"
# Blank line, then the comma-separated list of multi-valued features.
cat("\n")
print(
  paste("Multi unique value columns:", toString(multi_unique_columns))
)
[1] "Multi unique value columns: Mjob, Fjob, reason, guardian"
Turning binary unique values into numeric
If a feature has the values ‘yes’ and ‘no’, ‘yes’ is encoded as 1 and ‘no’ as 0. The encodings of all other binary features are shown below.
# Console banner printed ahead of the Portuguese-data encoding messages.
cat('----Portugese Data----\n\n')
----Portugese Data----
# Recode every binary feature of porData as 0/1.
#   * yes/no features: "yes" -> 1, "no" -> 0
#   * other features : first value seen in the data -> 0, the other -> 1
#     (the chosen mapping is printed so it can be checked)
# Missing values stay NA. They are ignored when the two levels are
# determined; otherwise an NA could be picked as a "level" and flip the
# encoding silently.
for (col in binary_columns) {
  column_values <- porData[[col]]
  unique_vals <- unique(column_values[!is.na(column_values)])

  if (length(unique_vals) != 2) {
    stop(
      "Column '", col, "' of porData is not binary: found ",
      length(unique_vals), " distinct non-missing values.",
      call. = FALSE
    )
  }

  if (all(c("yes", "no") %in% unique_vals)) {
    porData[[col]] <- ifelse(column_values == "yes", 1, 0)
  } else {
    porData[[col]] <- ifelse(column_values == unique_vals[1], 0, 1)
    print(paste(
      "Portugese's Feature:", col, " is encoded ",
      unique_vals[1], " as 0 ", unique_vals[2], " as 1"
    ))
  }
}
[1] "Portugese's Feature: school is encoded GP as 0 MS as 1"
[1] "Portugese's Feature: sex is encoded F as 0 M as 1"
[1] "Portugese's Feature: address is encoded U as 0 R as 1"
[1] "Portugese's Feature: famsize is encoded GT3 as 0 LE3 as 1"
[1] "Portugese's Feature: Pstatus is encoded A as 0 T as 1"
# Console banner printed ahead of the Math-data encoding messages.
cat('\n----Math Data----\n\n')
----Math Data----
# Recode the same binary features in mathData as 0/1.
#   * yes/no features: "yes" -> 1, "no" -> 0
#   * other features : first value seen in the data -> 0, the other -> 1
#     (the chosen mapping is printed so it can be compared with porData)
# binary_columns was derived from porData, so each column is checked to be
# binary in mathData as well instead of being mis-encoded silently.
# Missing values stay NA and are ignored when the levels are determined.
for (col in binary_columns) {
  column_values <- mathData[[col]]
  unique_vals <- unique(column_values[!is.na(column_values)])

  if (length(unique_vals) != 2) {
    stop(
      "Column '", col, "' of mathData is not binary: found ",
      length(unique_vals), " distinct non-missing values.",
      call. = FALSE
    )
  }

  if (all(c("yes", "no") %in% unique_vals)) {
    mathData[[col]] <- ifelse(column_values == "yes", 1, 0)
  } else {
    mathData[[col]] <- ifelse(column_values == unique_vals[1], 0, 1)
    print(paste(
      "Math's Feature:", col, " is encoded ",
      unique_vals[1], " as 0 ", unique_vals[2], " as 1"
    ))
  }
}
[1] "Math's Feature: school is encoded GP as 0 MS as 1"
[1] "Math's Feature: sex is encoded F as 0 M as 1"
[1] "Math's Feature: address is encoded U as 0 R as 1"
[1] "Math's Feature: famsize is encoded GT3 as 0 LE3 as 1"
[1] "Math's Feature: Pstatus is encoded A as 0 T as 1"
Performing 1-hot encoding for Multivalued columns
# Console banner printed ahead of the one-hot encoded Portuguese data.
cat('----Portugese Data----\n\n')
----Portugese Data----
# One-hot encode every multi-valued feature of porData and drop the
# original column.
for (col in multi_unique_columns) {
  # `~ 0 + <col>` removes the intercept, so each level gets its own
  # indicator column.
  one_hot <- model.matrix(as.formula(paste("~ 0 +", col)), data = porData)

  # model.matrix() names the indicators <col><level> (e.g. "Mjobteacher").
  # The earlier gsub("^.\\.", ...) rename never matched those names, so it
  # is dropped; the resulting column names are unchanged.
  porData <- cbind(porData, as.data.frame(one_hot))
  porData[[col]] <- NULL
}
head(porData)

# Console banner printed ahead of the one-hot encoded Math data.
cat("\n----Math Data----\n\n")
----Math Data----
# One-hot encode the same multi-valued features in mathData and drop the
# original column.
for (col in multi_unique_columns) {
  # `~ 0 + <col>` removes the intercept, so each level gets its own
  # indicator column.
  one_hot <- model.matrix(as.formula(paste("~ 0 +", col)), data = mathData)

  # model.matrix() names the indicators <col><level> (e.g. "Mjobteacher").
  # The earlier gsub("^.\\.", ...) rename never matched those names, so it
  # is dropped; the resulting column names are unchanged.
  mathData <- cbind(mathData, as.data.frame(one_hot))
  mathData[[col]] <- NULL
}
head(mathData)
We’ll Focus only on Grade 3(G3) which is the final Grade
Mothers education or Fathers education
Fathers Education
# Por's model: simple linear regression of the final grade (G3) on the
# father's education (Fedu), fitted on the Portuguese data.
modelPor_Fedu <- lm(G3 ~ Fedu, data = porData)
# Print coefficients, residual quantiles and fit statistics.
summary(modelPor_Fedu)
Call:
lm(formula = G3 ~ Fedu, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.9594 -1.7153 -0.0932 2.0406 7.9068
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.4711 0.2883 36.314 < 2e-16 ***
Fedu 0.6221 0.1129 5.512 5.12e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.16 on 647 degrees of freedom
Multiple R-squared: 0.04486, Adjusted R-squared: 0.04338
F-statistic: 30.39 on 1 and 647 DF, p-value: 5.117e-08
# Scatter plot of final grade against father's education (Portuguese data),
# with the fitted regression line drawn in red on top.
plot(
  x = porData$Fedu,
  y = porData$G3,
  main = "Fathers's Education (Fedu) vs. Final Grade (G3) for Portuguese Data",
  xlab = "Father's Education",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Fedu, col = "red")
# Math's model: simple linear regression of the final grade (G3) on the
# father's education (Fedu), fitted on the Math data.
modelMath_Fedu <- lm(G3 ~ Fedu, data = mathData)
# Print coefficients, residual quantiles and fit statistics.
summary(modelMath_Fedu)
Call:
lm(formula = G3 ~ Fedu, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-11.3642 -1.9014 0.5614 2.9196 9.2777
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.7967 0.5763 15.264 < 2e-16 ***
Fedu 0.6419 0.2099 3.058 0.00238 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.534 on 393 degrees of freedom
Multiple R-squared: 0.02324, Adjusted R-squared: 0.02076
F-statistic: 9.352 on 1 and 393 DF, p-value: 0.00238
# Scatter plot of final grade against father's education (Math data),
# with the fitted regression line drawn in red on top.
plot(
  x = mathData$Fedu,
  y = mathData$G3,
  main = "Fathers's Education (Fedu) vs. Final Grade (G3) for Math Data",
  xlab = "Father's Education",
  ylab = "Final Grade (G3)"
)
abline(reg = modelMath_Fedu, col = "red")
Mothers Education
# Por's model: simple linear regression of the final grade (G3) on the
# mother's education (Medu), fitted on the Portuguese data.
modelPor_Medu <- lm(G3 ~ Medu, data = porData)
# Print coefficients, residual quantiles and fit statistics.
summary(modelPor_Medu)
Call:
lm(formula = G3 ~ Medu, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.9217 -1.5541 0.0783 2.0783 7.1298
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.1864 0.2998 33.982 < 2e-16 ***
Medu 0.6838 0.1087 6.293 5.75e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.139 on 647 degrees of freedom
Multiple R-squared: 0.05767, Adjusted R-squared: 0.05622
F-statistic: 39.6 on 1 and 647 DF, p-value: 5.752e-10
# Scatter plot of final grade against mother's education (Portuguese data),
# with the fitted regression line drawn in red on top.
plot(
  x = porData$Medu,
  y = porData$G3,
  main = "Mother's Education (Medu) vs. Final Grade (G3) for Portuguese Data",
  xlab = "Mother's Education",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Medu, col = "red")
#Math's Model
modelMath_Medu <- lm(G3 ~ Medu, data = mathData)
summary(modelMath_Medu)
Call:
lm(formula = G3 ~ Medu, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-11.5517 -1.7342 0.4483 3.2202 9.2658
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.9167 0.6097 12.98 < 2e-16 ***
Medu 0.9088 0.2061 4.41 1.34e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.478 on 393 degrees of freedom
Multiple R-squared: 0.04715, Adjusted R-squared: 0.04473
F-statistic: 19.45 on 1 and 393 DF, p-value: 1.336e-05
# Scatter of G3 against Medu for the math data, with the fitted line.
plot(
  x = mathData$Medu,
  y = mathData$G3,
  main = "Mother's Education (Medu) vs. Final Grade (G3) for Math Data ",
  xlab = "Mother's Education",
  ylab = "Final Grade (G3)"
)
abline(reg = modelMath_Medu, col = "red")
Let’s look at the combined impact of both mother’s and father’s education on final grades
# Portuguese model with both parents' education levels as predictors.
modelPor_Edu <- lm(formula = G3 ~ Medu + Fedu, data = porData)
summary(object = modelPor_Edu)
Call:
lm(formula = G3 ~ Medu + Fedu, data = porData)
Residuals:
Min 1Q Median 3Q Max
-13.1385 -1.5689 -0.0537 1.9463 7.5159
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.9791 0.3176 31.42 < 2e-16 ***
Medu 0.5051 0.1423 3.55 0.000414 ***
Fedu 0.2848 0.1468 1.94 0.052793 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.132 on 646 degrees of freedom
Multiple R-squared: 0.06313, Adjusted R-squared: 0.06023
F-statistic: 21.77 on 2 and 646 DF, p-value: 7.115e-10
# Coefficient plot for the two-predictor Portuguese model.
coefplot(model = modelPor_Edu)
# Math model with both parents' education levels as predictors.
modelMath_Edu <- lm(formula = G3 ~ Medu + Fedu, data = mathData)
summary(object = modelMath_Edu)
Call:
lm(formula = G3 ~ Medu + Fedu, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-11.6344 -1.7275 0.3901 3.2260 9.2725
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.8205 0.6478 12.073 < 2e-16 ***
Medu 0.8359 0.2638 3.168 0.00165 **
Fedu 0.1176 0.2654 0.443 0.65796
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.482 on 392 degrees of freedom
Multiple R-squared: 0.04763, Adjusted R-squared: 0.04277
F-statistic: 9.802 on 2 and 392 DF, p-value: 7.013e-05
# Coefficient plot for the two-predictor math model.
coefplot(model = modelMath_Edu)
Does spending more time studying actually improve grades?
# Portuguese model: final grade (G3) regressed on weekly study time.
# Uses `<-` for assignment, matching every other model fit in this file.
modelPor_Study <- lm(G3 ~ studytime, data = porData)
summary(modelPor_Study)
Call:
lm(formula = G3 ~ studytime, data = porData)
Residuals:
Min 1Q Median 3Q Max
-11.9735 -1.9463 0.0265 2.0265 7.0265
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.0278 0.3115 32.191 < 2e-16 ***
studytime 0.9728 0.1483 6.562 1.09e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.131 on 647 degrees of freedom
Multiple R-squared: 0.06239, Adjusted R-squared: 0.06095
F-statistic: 43.06 on 1 and 647 DF, p-value: 1.091e-10
# Scatter of G3 against study time for the Portuguese data, with the fit.
plot(
  x = porData$studytime,
  y = porData$G3,
  main = "Study Time (studytime) vs. Final Grade (G3) for Portuguese Data",
  xlab = "Study time",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Study, col = "red")
# Math model: final grade regressed on study time.
modelMath_Study <- lm(formula = G3 ~ studytime, data = mathData)
summary(object = modelMath_Study)
Call:
lm(formula = G3 ~ studytime, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-11.4643 -1.8623 0.5357 3.0697 9.1377
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.3283 0.6033 15.463 <2e-16 ***
studytime 0.5340 0.2741 1.949 0.0521 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.565 on 393 degrees of freedom
Multiple R-squared: 0.009569, Adjusted R-squared: 0.007049
F-statistic: 3.797 on 1 and 393 DF, p-value: 0.05206
# Scatter of G3 against study time for the math data, with the fit.
plot(
  x = mathData$studytime,
  y = mathData$G3,
  main = "Study Time (studytime) vs. Final Grade (G3) for Math Data",
  xlab = "Study time",
  ylab = "Final Grade (G3)"
)
abline(reg = modelMath_Study, col = "red")
NA
NA
NA
A problem with the predictors above is that they are ordinal and have a limited range. How to deal with ordinal data in regression: https://stats.stackexchange.com/questions/164689/ordinal-data-in-regression
absences - number of school absences (numeric: from 0 to 93)
# Portuguese model: final grade (G3) regressed on the number of absences.
modelPor_Absences <- lm(formula = G3 ~ absences, data = porData)
summary(object = modelPor_Absences)
Call:
lm(formula = G3 ~ absences, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.1388 -1.8207 -0.1388 1.9884 7.1157
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.13880 0.16099 75.399 <2e-16 ***
absences -0.06361 0.02725 -2.334 0.0199 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.22 on 647 degrees of freedom
Multiple R-squared: 0.00835, Adjusted R-squared: 0.006817
F-statistic: 5.448 on 1 and 647 DF, p-value: 0.0199
# Scatter of G3 against absences for the Portuguese data, with the fit.
plot(
  x = porData$absences,
  y = porData$G3,
  main = "Absences vs. Final Grade (G3) for Portuguese Data",
  xlab = "Number of Absences",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Absences, col = "red")
# Math model: final grade regressed on the number of absences.
modelMath_Absences <- lm(formula = G3 ~ absences, data = mathData)
summary(object = modelMath_Absences)
Call:
lm(formula = G3 ~ absences, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-10.3033 -2.3033 0.5007 3.4811 9.6183
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.30327 0.28347 36.347 <2e-16 ***
absences 0.01961 0.02886 0.679 0.497
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.585 on 393 degrees of freedom
Multiple R-squared: 0.001173, Adjusted R-squared: -0.001369
F-statistic: 0.4615 on 1 and 393 DF, p-value: 0.4973
# Scatter of G3 against absences for the math data, with the fit.
plot(
  x = mathData$absences,
  y = mathData$G3,
  main = "Absences vs. Final Grade (G3) for Math Data",
  xlab = "Number of Absences",
  ylab = "Final Grade (G3)"
)
abline(reg = modelMath_Absences, col = "red")
# Both absence models side by side in one coefficient plot.
multiplot(
  modelPor_Absences,
  modelMath_Absences,
  names = c("Portuguese Model", "Math Model")
)
NA
NA
Workday Alcohol Consumption (Dalc)
# Portuguese model: final grade (G3) regressed on Dalc.
modelPor_Dalc <- lm(formula = G3 ~ Dalc, data = porData)
summary(object = modelPor_Dalc)
Call:
lm(formula = G3 ~ Dalc, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.2652 -1.5501 -0.2652 1.7348 7.1650
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.9804 0.2371 54.75 < 2e-16 ***
Dalc -0.7151 0.1344 -5.32 1.43e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.165 on 647 degrees of freedom
Multiple R-squared: 0.04191, Adjusted R-squared: 0.04043
F-statistic: 28.3 on 1 and 647 DF, p-value: 1.432e-07
# Scatter of G3 against Dalc for the Portuguese data, with the fit.
plot(
  x = porData$Dalc,
  y = porData$G3,
  main = "Workday Alcohol Consumption (Dalc) vs. Final Grade (G3) for Portuguese Data",
  xlab = "Dalc",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Dalc, col = "red")
# Math model: final grade regressed on Dalc.
modelMath_Dalc <- lm(formula = G3 ~ Dalc, data = mathData)
summary(object = modelMath_Dalc)
Call:
lm(formula = G3 ~ Dalc, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-10.5504 -1.9881 0.4496 3.4496 9.4496
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.8316 0.4476 24.201 <2e-16 ***
Dalc -0.2811 0.2591 -1.085 0.278
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.58 on 393 degrees of freedom
Multiple R-squared: 0.002988, Adjusted R-squared: 0.0004508
F-statistic: 1.178 on 1 and 393 DF, p-value: 0.2785
# Scatter of G3 against Dalc for the math data, with the fitted line.
plot(
  x = mathData$Dalc,
  y = mathData$G3,
  main = "Workday Alcohol Consumption (Dalc) vs. Final Grade (G3) for Math Data",
  xlab = "Dalc",
  ylab = "Final Grade (G3)"
)
abline(modelMath_Dalc, col = "red")
# Both Dalc models in one coefficient plot. Labels say "Workday" (not "Daily")
# so they agree with the plot titles used for Dalc.
multiplot(
  modelPor_Dalc,
  modelMath_Dalc,
  names = c(
    "Workday Alc consumption for Portuguese Data",
    "Workday Alc consumption for Math Data"
  )
)
Weekend Alcohol Consumption (Walc)
# Portuguese model: final grade (G3) regressed on Walc.
modelPor_Walc <- lm(formula = G3 ~ Walc, data = porData)
summary(object = modelPor_Walc)
Call:
lm(formula = G3 ~ Walc, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.4748 -1.5863 -0.0306 1.9694 7.8579
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.91911 0.25470 50.723 <2e-16 ***
Walc -0.44426 0.09733 -4.564 6e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.182 on 647 degrees of freedom
Multiple R-squared: 0.03119, Adjusted R-squared: 0.0297
F-statistic: 20.83 on 1 and 647 DF, p-value: 5.999e-06
# Scatter of G3 against Walc for the Portuguese data, with the fit.
plot(
  x = porData$Walc,
  y = porData$G3,
  main = "Weekend Alcohol Consumption (Walc) vs. Final Grade (G3) for Portuguese Data",
  xlab = "Walc",
  ylab = "Final Grade (G3)"
)
abline(reg = modelPor_Walc, col = "red")
# Math model: final grade regressed on Walc.
modelMath_Walc <- lm(formula = G3 ~ Walc, data = mathData)
summary(object = modelMath_Walc)
Call:
lm(formula = G3 ~ Walc, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-10.6537 -2.0071 0.3463 3.3463 9.3463
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.8385 0.4708 23.019 <2e-16 ***
Walc -0.1848 0.1792 -1.031 0.303
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.581 on 393 degrees of freedom
Multiple R-squared: 0.002698, Adjusted R-squared: 0.00016
F-statistic: 1.063 on 1 and 393 DF, p-value: 0.3032
# Scatter of G3 against Walc for the math data, with the fitted line.
plot(
  x = mathData$Walc,
  y = mathData$G3,
  main = "Weekend Alcohol Consumption (Walc) vs. Final Grade (G3) for Math Data",
  xlab = "Walc",
  ylab = "Final Grade (G3)"
)
abline(modelMath_Walc, col = "red")
# Both Walc models in one coefficient plot. Labels say "Weekend" (not
# "Weekly") so they agree with the plot titles used for Walc.
multiplot(
  modelPor_Walc,
  modelMath_Walc,
  names = c(
    "Weekend Alc consumption for Portuguese Data",
    "Weekend Alc consumption for Math Data"
  )
)
Overview of the alcohol-consumption coefficients so far
# All four alcohol models in one coefficient plot. Labels use "Workday" for
# Dalc and "Weekend" for Walc (previously "Daily"/"Weekly") so they agree with
# the scatter-plot titles for the same variables.
multiplot(
  modelPor_Dalc,
  modelMath_Dalc,
  modelPor_Walc,
  modelMath_Walc,
  names = c(
    "Workday Alc consumption for Portuguese Data",
    "Workday Alc consumption for Math Data",
    "Weekend Alc consumption for Portuguese Data",
    "Weekend Alc consumption for Math Data"
  )
)
# Portuguese model combining the predictors examined one at a time above.
modelPor_Multi <- lm(G3 ~ Medu + Fedu + studytime + Dalc + Walc, data = porData)
summary(modelPor_Multi)
Call:
lm(formula = G3 ~ Medu + Fedu + studytime + Dalc + Walc, data = porData)
Residuals:
Min 1Q Median 3Q Max
-12.899 -1.522 0.149 1.790 7.997
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.6366 0.4839 19.913 < 2e-16 ***
Medu 0.4275 0.1368 3.125 0.00186 **
Fedu 0.3117 0.1409 2.213 0.02724 *
studytime 0.7796 0.1459 5.342 1.28e-07 ***
Dalc -0.5245 0.1618 -3.242 0.00125 **
Walc -0.1060 0.1184 -0.895 0.37114
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.996 on 643 degrees of freedom
Multiple R-squared: 0.1467, Adjusted R-squared: 0.1401
F-statistic: 22.11 on 5 and 643 DF, p-value: < 2.2e-16
# Coefficient plot for the five-predictor Portuguese model.
coefplot(
  modelPor_Multi,
  title = "Coefficient Plot for modelPor_Multi model"
)
# Math model with the same five predictors.
modelMath_Multi <- lm(
  formula = G3 ~ Medu + Fedu + studytime + Dalc + Walc,
  data = mathData
)
summary(object = modelMath_Multi)
Call:
lm(formula = G3 ~ Medu + Fedu + studytime + Dalc + Walc, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-12.4099 -1.8812 0.4922 3.0686 8.7659
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.27335 1.02407 7.102 5.87e-12 ***
Medu 0.80668 0.26555 3.038 0.00254 **
Fedu 0.13993 0.26589 0.526 0.59901
studytime 0.42755 0.27917 1.532 0.12645
Dalc -0.25334 0.33370 -0.759 0.44820
Walc 0.03321 0.23389 0.142 0.88717
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.478 on 389 degrees of freedom
Multiple R-squared: 0.05679, Adjusted R-squared: 0.04467
F-statistic: 4.684 on 5 and 389 DF, p-value: 0.0003635
# Coefficient plot for the five-predictor math model.
coefplot(
  modelMath_Multi,
  title = "Coefficient Plot for modelMath_Multi model"
)
#### Let’s compute the correlation matrix #### Remember: correlation is not causation!
# Pairwise correlation matrix of every column in the math data, drawn as a
# correlation plot.
corMath <- cor(x = mathData)
corrplot(corr = corMath)
# Same for the Portuguese data.
corPor <- cor(x = porData)
corrplot(corr = corPor)
Let’s focus only on G3
# Correlation of each variable with the final grade, Portuguese data.
G3Por_cor <- corPor[, "G3"]
layout(
  plot_ly(x = names(G3Por_cor), y = G3Por_cor, type = "bar"),
  title = "Correlation of Final Grade(G3) with Other Variables for Portuguese Data",
  yaxis = list(title = "Correlation Coefficient")
)
# Same view for the math data.
G3Math_cor <- corMath[, "G3"]
layout(
  plot_ly(x = names(G3Math_cor), y = G3Math_cor, type = "bar"),
  title = "Correlation of Final Grade(G3) with Other Variables for Math Data",
  yaxis = list(title = "Correlation Coefficient")
)
# Portuguese bars again, with the variable names turned into a factor whose
# levels follow increasing correlation (intended to order the x axis).
sorted_names <- names(sort(G3Por_cor))
factor_names <- factor(names(G3Por_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Por_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Portuguese Data",
  yaxis = list(title = "Correlation Coefficient")
)
# For Math Data
sorted_names <- names(sort(G3Math_cor))
factor_names <- factor(names(G3Math_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Math_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Math Data",
  yaxis = list(title = "Correlation Coefficient")
)
Dropping G1 and G2
# Remove the G1 and G2 columns from both datasets.
mathData <- select(mathData, -G1, -G2)
porData <- select(porData, -G1, -G2)
# Recompute each variable's correlation with G3 on the reduced data.
G3Por_cor <- cor(porData)[, "G3"]
G3Math_cor <- cor(mathData)[, "G3"]
# Portuguese bars, with names as a factor whose levels follow increasing
# correlation (intended to order the x axis).
sorted_names <- names(sort(G3Por_cor))
factor_names <- factor(names(G3Por_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Por_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Portuguese Data",
  yaxis = list(title = "Correlation Coefficient")
)
# For Math Data
sorted_names <- names(sort(G3Math_cor))
factor_names <- factor(names(G3Math_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Math_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Math Data",
  yaxis = list(title = "Correlation Coefficient")
)
NA
NA
Let’s include all of the features and build a linear regression model for each of the two datasets:
# Fix the RNG seed so the train/test splits below are reproducible.
set.seed(seed = 123)
# Banner separating the Portuguese results in the printed output.
cat("\n\n--------------For Portuguese Data--------------\n\n")
--------------For Portuguese Data--------------
# Split the Portuguese rows 80/20 with sample.split on G3, then fit a linear
# model of G3 on every other column using the training rows only.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(porData$G3, SplitRatio = 0.8)
train_data <- subset(porData, split == TRUE)
test_data <- subset(porData, split == FALSE)
model <- lm(G3 ~ ., data = train_data)
print(summary(model))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-12.8424 -1.4794 0.0023 1.4845 7.4442
Coefficients: (4 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.595056 2.426206 4.367 1.54e-05 ***
school -1.002573 0.298814 -3.355 0.000856 ***
sex -0.570677 0.281030 -2.031 0.042840 *
age 0.123936 0.112631 1.100 0.271721
address -0.366429 0.295274 -1.241 0.215219
famsize 0.366823 0.272491 1.346 0.178878
Pstatus 0.172506 0.388575 0.444 0.657282
Medu 0.030633 0.169053 0.181 0.856285
Fedu 0.151645 0.154765 0.980 0.327657
traveltime 0.083146 0.178179 0.467 0.640968
studytime 0.342011 0.157634 2.170 0.030523 *
failures -1.583033 0.227284 -6.965 1.09e-11 ***
schoolsup -1.140145 0.408453 -2.791 0.005458 **
famsup 0.130538 0.252790 0.516 0.605820
paid -0.433639 0.593443 -0.731 0.465309
activities 0.499374 0.251682 1.984 0.047809 *
nursery -0.321192 0.310804 -1.033 0.301927
higher 2.087402 0.436772 4.779 2.34e-06 ***
internet 0.145208 0.305052 0.476 0.634284
romantic -0.459084 0.258972 -1.773 0.076910 .
famrel 0.144389 0.129589 1.114 0.265750
freetime -0.109753 0.125437 -0.875 0.382031
goout 0.040540 0.127380 0.318 0.750426
Dalc -0.276450 0.174284 -1.586 0.113353
Walc -0.067564 0.135947 -0.497 0.619425
health -0.150171 0.086810 -1.730 0.084295 .
absences -0.028027 0.027945 -1.003 0.316396
Mjobat_home -0.616686 0.557737 -1.106 0.269413
Mjobhealth 0.301072 0.599020 0.503 0.615471
Mjobother -0.574547 0.492701 -1.166 0.244146
Mjobservices -0.398453 0.481010 -0.828 0.407874
Mjobteacher NA NA NA NA
Fjobat_home -0.631090 0.737575 -0.856 0.392630
Fjobhealth -1.367956 0.836118 -1.636 0.102478
Fjobother -0.556288 0.588647 -0.945 0.345120
Fjobservices -0.911826 0.601177 -1.517 0.129992
Fjobteacher NA NA NA NA
reasoncourse -0.273371 0.338073 -0.809 0.419138
reasonhome -0.008687 0.370506 -0.023 0.981303
reasonother -0.543228 0.468622 -1.159 0.246950
reasonreputation NA NA NA NA
guardianfather -0.612864 0.621041 -0.987 0.324221
guardianmother -0.908096 0.572423 -1.586 0.113305
guardianother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.671 on 480 degrees of freedom
Multiple R-squared: 0.3713, Adjusted R-squared: 0.3202
F-statistic: 7.268 on 39 and 480 DF, p-value: < 2.2e-16
# Math data: banner separating the math results in the printed output.
cat("\n\n--------------For Math Data--------------\n\n")
--------------For Math Data--------------
# Split the math rows 80/20 with sample.split on G3, then fit a linear model
# of G3 on every other column using the training rows only.
# NOTE: split, train_data and test_data are reused here, so the Portuguese
# split assigned earlier is overwritten from this point on.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(mathData$G3, SplitRatio = 0.8)
train_data <- subset(mathData, split == TRUE)
test_data <- subset(mathData, split == FALSE)
modelMath <- lm(G3 ~ ., data = train_data)
print(summary(modelMath))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-13.760 -2.062 0.311 2.613 7.814
Coefficients: (4 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.41831 5.60320 3.644 0.00032 ***
school 1.39175 0.85952 1.619 0.10654
sex 1.08939 0.54982 1.981 0.04854 *
age -0.41581 0.24952 -1.666 0.09675 .
address -0.49086 0.63513 -0.773 0.44027
famsize 0.47400 0.53699 0.883 0.37816
Pstatus -0.34959 0.83489 -0.419 0.67574
Medu 0.38375 0.35351 1.086 0.27862
Fedu -0.09021 0.30754 -0.293 0.76949
traveltime -0.42055 0.38289 -1.098 0.27300
studytime 0.75869 0.31843 2.383 0.01786 *
failures -1.82335 0.39302 -4.639 5.39e-06 ***
schoolsup -1.70011 0.75385 -2.255 0.02490 *
famsup -1.31462 0.52507 -2.504 0.01286 *
paid 0.15593 0.52576 0.297 0.76701
activities -0.47595 0.49367 -0.964 0.33582
nursery -0.13653 0.62434 -0.219 0.82706
higher -0.81942 1.28562 -0.637 0.52441
internet 0.65831 0.67124 0.981 0.32758
romantic -1.09035 0.51121 -2.133 0.03381 *
famrel 0.20636 0.27044 0.763 0.44607
freetime 0.41741 0.26704 1.563 0.11917
goout -0.80849 0.24645 -3.281 0.00117 **
Dalc -0.24910 0.37181 -0.670 0.50343
Walc 0.36805 0.28345 1.298 0.19519
health -0.14291 0.18535 -0.771 0.44135
absences 0.08098 0.03510 2.307 0.02178 *
Mjobat_home 0.52898 1.13016 0.468 0.64011
Mjobhealth 2.38136 1.00162 2.378 0.01811 *
Mjobother 0.84924 0.91205 0.931 0.35259
Mjobservices 1.96054 0.85268 2.299 0.02223 *
Mjobteacher NA NA NA NA
Fjobat_home -1.80303 1.45869 -1.236 0.21748
Fjobhealth -1.87288 1.49097 -1.256 0.21012
Fjobother -3.02958 1.04472 -2.900 0.00403 **
Fjobservices -2.74051 1.07082 -2.559 0.01102 *
Fjobteacher NA NA NA NA
reasoncourse -0.63484 0.64778 -0.980 0.32793
reasonhome -0.42988 0.66502 -0.646 0.51854
reasonother 0.26341 0.92462 0.285 0.77594
reasonreputation NA NA NA NA
guardianfather -1.10625 1.13885 -0.971 0.33220
guardianmother -1.06594 1.04171 -1.023 0.30708
guardianother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.064 on 278 degrees of freedom
Multiple R-squared: 0.3091, Adjusted R-squared: 0.2122
F-statistic: 3.189 on 39 and 278 DF, p-value: 1.363e-08
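Dropping the additional features whose coefficients were not defined because of singularities (Mjobteacher, Fjobteacher, reasonreputation, guardianother)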
# Remove the four columns whose coefficients came back NA ("not defined
# because of singularities") in the all-features fits above.
porData <- select(
  porData,
  -Mjobteacher, -Fjobteacher, -reasonreputation, -guardianother
)
mathData <- select(
  mathData,
  -Mjobteacher, -Fjobteacher, -reasonreputation, -guardianother
)
# Reset the seed so the splits below are reproducible.
set.seed(seed = 123)
# Portuguese data: banner for the printed output.
cat("\n\n--------------For Portuguese Data--------------\n\n")
--------------For Portuguese Data--------------
# Split the Portuguese rows 80/20 with sample.split on G3, then refit the
# all-features linear model on the training rows only.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(porData$G3, SplitRatio = 0.8)
train_data <- subset(porData, split == TRUE)
test_data <- subset(porData, split == FALSE)
model <- lm(G3 ~ ., data = train_data)
print(summary(model))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-12.8424 -1.4794 0.0023 1.4845 7.4442
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.595056 2.426206 4.367 1.54e-05 ***
school -1.002573 0.298814 -3.355 0.000856 ***
sex -0.570677 0.281030 -2.031 0.042840 *
age 0.123936 0.112631 1.100 0.271721
address -0.366429 0.295274 -1.241 0.215219
famsize 0.366823 0.272491 1.346 0.178878
Pstatus 0.172506 0.388575 0.444 0.657282
Medu 0.030633 0.169053 0.181 0.856285
Fedu 0.151645 0.154765 0.980 0.327657
traveltime 0.083146 0.178179 0.467 0.640968
studytime 0.342011 0.157634 2.170 0.030523 *
failures -1.583033 0.227284 -6.965 1.09e-11 ***
schoolsup -1.140145 0.408453 -2.791 0.005458 **
famsup 0.130538 0.252790 0.516 0.605820
paid -0.433639 0.593443 -0.731 0.465309
activities 0.499374 0.251682 1.984 0.047809 *
nursery -0.321192 0.310804 -1.033 0.301927
higher 2.087402 0.436772 4.779 2.34e-06 ***
internet 0.145208 0.305052 0.476 0.634284
romantic -0.459084 0.258972 -1.773 0.076910 .
famrel 0.144389 0.129589 1.114 0.265750
freetime -0.109753 0.125437 -0.875 0.382031
goout 0.040540 0.127380 0.318 0.750426
Dalc -0.276450 0.174284 -1.586 0.113353
Walc -0.067564 0.135947 -0.497 0.619425
health -0.150171 0.086810 -1.730 0.084295 .
absences -0.028027 0.027945 -1.003 0.316396
Mjobat_home -0.616686 0.557737 -1.106 0.269413
Mjobhealth 0.301072 0.599020 0.503 0.615471
Mjobother -0.574547 0.492701 -1.166 0.244146
Mjobservices -0.398453 0.481010 -0.828 0.407874
Fjobat_home -0.631090 0.737575 -0.856 0.392630
Fjobhealth -1.367956 0.836118 -1.636 0.102478
Fjobother -0.556288 0.588647 -0.945 0.345120
Fjobservices -0.911826 0.601177 -1.517 0.129992
reasoncourse -0.273371 0.338073 -0.809 0.419138
reasonhome -0.008687 0.370506 -0.023 0.981303
reasonother -0.543228 0.468622 -1.159 0.246950
guardianfather -0.612864 0.621041 -0.987 0.324221
guardianmother -0.908096 0.572423 -1.586 0.113305
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.671 on 480 degrees of freedom
Multiple R-squared: 0.3713, Adjusted R-squared: 0.3202
F-statistic: 7.268 on 39 and 480 DF, p-value: < 2.2e-16
# RMSE of the Portuguese model on the rows currently held in test_data.
actual_values <- test_data[["G3"]]
predicted_values <- predict(object = model, newdata = test_data)
rmse_por <- rmse(actual_values, predicted_values)
cat("RMSE for Portuguese Data:", rmse_por, "\n")
RMSE for Portuguese Data: 2.749893
# Math data: banner separating the math results in the printed output.
cat("\n\n--------------For Math Data--------------\n\n")
--------------For Math Data--------------
# Split the math rows 80/20 with sample.split on G3, then refit the
# all-features linear model on the training rows only.
# NOTE: split, train_data and test_data are reused here, so from this point
# on they hold the math split, not the Portuguese one.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(mathData$G3, SplitRatio = 0.8)
train_data <- subset(mathData, split == TRUE)
test_data <- subset(mathData, split == FALSE)
modelMath <- lm(G3 ~ ., data = train_data)
print(summary(modelMath))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-13.760 -2.062 0.311 2.613 7.814
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 20.41831 5.60320 3.644 0.00032 ***
school 1.39175 0.85952 1.619 0.10654
sex 1.08939 0.54982 1.981 0.04854 *
age -0.41581 0.24952 -1.666 0.09675 .
address -0.49086 0.63513 -0.773 0.44027
famsize 0.47400 0.53699 0.883 0.37816
Pstatus -0.34959 0.83489 -0.419 0.67574
Medu 0.38375 0.35351 1.086 0.27862
Fedu -0.09021 0.30754 -0.293 0.76949
traveltime -0.42055 0.38289 -1.098 0.27300
studytime 0.75869 0.31843 2.383 0.01786 *
failures -1.82335 0.39302 -4.639 5.39e-06 ***
schoolsup -1.70011 0.75385 -2.255 0.02490 *
famsup -1.31462 0.52507 -2.504 0.01286 *
paid 0.15593 0.52576 0.297 0.76701
activities -0.47595 0.49367 -0.964 0.33582
nursery -0.13653 0.62434 -0.219 0.82706
higher -0.81942 1.28562 -0.637 0.52441
internet 0.65831 0.67124 0.981 0.32758
romantic -1.09035 0.51121 -2.133 0.03381 *
famrel 0.20636 0.27044 0.763 0.44607
freetime 0.41741 0.26704 1.563 0.11917
goout -0.80849 0.24645 -3.281 0.00117 **
Dalc -0.24910 0.37181 -0.670 0.50343
Walc 0.36805 0.28345 1.298 0.19519
health -0.14291 0.18535 -0.771 0.44135
absences 0.08098 0.03510 2.307 0.02178 *
Mjobat_home 0.52898 1.13016 0.468 0.64011
Mjobhealth 2.38136 1.00162 2.378 0.01811 *
Mjobother 0.84924 0.91205 0.931 0.35259
Mjobservices 1.96054 0.85268 2.299 0.02223 *
Fjobat_home -1.80303 1.45869 -1.236 0.21748
Fjobhealth -1.87288 1.49097 -1.256 0.21012
Fjobother -3.02958 1.04472 -2.900 0.00403 **
Fjobservices -2.74051 1.07082 -2.559 0.01102 *
reasoncourse -0.63484 0.64778 -0.980 0.32793
reasonhome -0.42988 0.66502 -0.646 0.51854
reasonother 0.26341 0.92462 0.285 0.77594
guardianfather -1.10625 1.13885 -0.971 0.33220
guardianmother -1.06594 1.04171 -1.023 0.30708
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.064 on 278 degrees of freedom
Multiple R-squared: 0.3091, Adjusted R-squared: 0.2122
F-statistic: 3.189 on 39 and 278 DF, p-value: 1.363e-08
# RMSE of the math model on the rows currently held in test_data.
actual_values_math <- test_data[["G3"]]
predicted_values_math <- predict(object = modelMath, newdata = test_data)
rmse_math <- rmse(actual_values_math, predicted_values_math)
cat("RMSE for Math Data:", rmse_math, "\n")
RMSE for Math Data: 4.610122
# Coefficient plots for the two all-features models.
coefplot(
  model,
  title = "Coefficient Plot for Portuguese Data"
)
coefplot(
  modelMath,
  title = "Coefficient Plot for Math Data"
)
Ordered coefficients
# Portuguese model: coefficients ordered by absolute size, plus a
# magnitude-sorted coefficient plot.
coefficients <- coef(model)
sorted_coefficients <- coefficients[
  order(abs(coefficients), decreasing = TRUE)
]
coefplot(
  model,
  title = "Coefficient Plot for Portuguese Data",
  sort = "magnitude"
)
# Same for the math model.
coefficientsMath <- coef(modelMath)
sorted_coefficientsMath <- coefficientsMath[
  order(abs(coefficientsMath), decreasing = TRUE)
]
coefplot(
  modelMath,
  title = "Coefficient Plot for Math Data",
  sort = "magnitude"
)
# Bar chart of the Portuguese coefficients; the names become a factor whose
# levels follow increasing signed value (intended to order the x axis).
sorted_names <- names(sort(sorted_coefficients))
factor_names <- factor(names(sorted_coefficients), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = sorted_coefficients, type = "bar"),
  title = "Sorted Coefficients in Increasing Order for Portuguese Data",
  yaxis = list(title = "Coefficient Value")
)
# For math Data
sorted_names <- names(sort(sorted_coefficientsMath))
factor_names <- factor(names(sorted_coefficientsMath), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = sorted_coefficientsMath, type = "bar"),
  title = "Sorted Coefficients in Increasing Order for Math Data",
  yaxis = list(title = "Coefficient Value")
)
NA
# Hand-picked predictor subsets for the reduced models.
important_features_Por <- c(
  "school", "failures", "schoolsup", "higher", "sex",
  "studytime", "activities", "romantic", "health"
)
important_features_Math <- c(
  "sex", "studytime", "failures", "schoolsup", "famsup", "romantic",
  "goout", "absences", "Mjobhealth", "Mjobservices", "Fjobother",
  "Fjobservices"
)
set.seed(seed = 123)
# Portuguese data: build "G3 ~ <features>" and fit it on all of porData.
formulaPor <- reformulate(important_features_Por, response = "G3")
model <- lm(formulaPor, data = porData)
summary(object = model)
Call:
lm(formula = formulaPor, data = porData)
Residuals:
Min 1Q Median 3Q Max
-11.7674 -1.4722 -0.0888 1.6632 7.7743
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.19384 0.53940 20.752 < 2e-16 ***
school -1.57430 0.23143 -6.802 2.37e-11 ***
failures -1.49067 0.19056 -7.823 2.15e-14 ***
schoolsup -1.45680 0.35555 -4.097 4.72e-05 ***
higher 1.99586 0.37067 5.384 1.02e-07 ***
sex -0.70079 0.22943 -3.054 0.002348 **
studytime 0.49433 0.13561 3.645 0.000289 ***
activities 0.23278 0.21659 1.075 0.282882
romantic -0.45693 0.22528 -2.028 0.042948 *
health -0.18431 0.07447 -2.475 0.013586 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.706 on 639 degrees of freedom
Multiple R-squared: 0.3082, Adjusted R-squared: 0.2984
F-statistic: 31.63 on 9 and 639 DF, p-value: < 2.2e-16
# Coefficients of the reduced Portuguese model, ordered by absolute size,
# plus a magnitude-sorted coefficient plot.
coefficients <- coef(model)
sorted_coefficients <- coefficients[order(abs(coefficients), decreasing = TRUE)]
coefplot(model, title = 'Coefficient Plot for Portuguese Data', sort = "magnitude")

# Hold-out RMSE for the reduced Portuguese formula.
# Bug fix: `test_data` was last assigned from mathData, so scoring on it
# compared Portuguese-model predictions with math rows. In addition, `model`
# was fitted on all of porData, hold-out rows included. Rebuild the Portuguese
# 80/20 split (same seed and call as the all-features split) under separate
# names, refit the reduced formula on the training rows only, and score that.
# `model`, `split`, `train_data` and `test_data` are left untouched.
set.seed(123)
split_por <- sample.split(porData$G3, SplitRatio = 0.8)
train_data_por <- subset(porData, split_por == TRUE)
test_data_por <- subset(porData, split_por == FALSE)
model_por_holdout <- lm(formulaPor, data = train_data_por)
predicted_values <- predict(model_por_holdout, newdata = test_data_por)
actual_values <- test_data_por$G3
rmse_por <- rmse(actual_values, predicted_values)
cat('RMSE for Portuguese Data:', rmse_por, '\n')
RMSE for Portuguese Data: 4.260732
# Math data: build "G3 ~ <features>" and fit it on all of mathData.
formulaMath <- reformulate(important_features_Math, response = "G3")
modelMath <- lm(formulaMath, data = mathData)
summary(object = modelMath)
Call:
lm(formula = formulaMath, data = mathData)
Residuals:
Min 1Q Median 3Q Max
-12.7405 -1.8524 0.3052 2.6915 8.5256
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.77060 1.06696 11.032 < 2e-16 ***
sex 1.25145 0.44404 2.818 0.00508 **
studytime 0.53875 0.26488 2.034 0.04265 *
failures -2.09888 0.28806 -7.286 1.84e-12 ***
schoolsup -1.12573 0.63121 -1.783 0.07531 .
famsup -0.71675 0.43799 -1.636 0.10256
romantic -1.10400 0.45072 -2.449 0.01476 *
goout -0.44629 0.18822 -2.371 0.01823 *
absences 0.05947 0.02636 2.256 0.02463 *
Mjobhealth 2.37999 0.75824 3.139 0.00183 **
Mjobservices 1.47722 0.49133 3.007 0.00282 **
Fjobother -1.12730 0.58200 -1.937 0.05349 .
Fjobservices -1.11196 0.64556 -1.722 0.08579 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.095 on 382 degrees of freedom
Multiple R-squared: 0.2255, Adjusted R-squared: 0.2012
F-statistic: 9.269 on 12 and 382 DF, p-value: 9.263e-16
# Coefficients of the reduced math model, ordered by absolute size, plus a
# magnitude-sorted coefficient plot.
coefficientsMath <- coef(modelMath)
sorted_coefficientsMath <- coefficientsMath[order(abs(coefficientsMath), decreasing = TRUE)]
coefplot(modelMath, title = 'Coefficient Plot for Math Data', sort = "magnitude")

# Hold-out RMSE for the reduced math formula.
# Fix: `modelMath` was fitted on all of mathData, which contains the
# `test_data` rows, so scoring it on `test_data` is not an out-of-sample
# error. Refit the same formula on `train_data` (the math training split at
# this point in the file) and score that on `test_data` instead.
# `modelMath` itself is left untouched.
model_math_holdout <- lm(formulaMath, data = train_data)
predicted_values_math <- predict(model_math_holdout, newdata = test_data)
actual_values_math <- test_data$G3
rmse_math <- rmse(actual_values_math, predicted_values_math)
cat('RMSE for Math Data:', rmse_math, '\n')
RMSE for Math Data: 4.170892
Adding a new column called ‘Avg_alc’, the average of workday (Dalc) and weekend (Walc) alcohol consumption, and dropping Dalc and Walc.
# Add Avg_alc as the mean of Dalc and Walc, then remove the two source
# columns, for both datasets.
mathData[["Avg_alc"]] <- (mathData[["Dalc"]] + mathData[["Walc"]]) / 2
mathData <- select(mathData, -Dalc, -Walc)
porData[["Avg_alc"]] <- (porData[["Dalc"]] + porData[["Walc"]]) / 2
porData <- select(porData, -Dalc, -Walc)
# Recompute each variable's correlation with G3 on the updated data.
G3Por_cor <- cor(porData)[, "G3"]
G3Math_cor <- cor(mathData)[, "G3"]
# Portuguese bars, with names as a factor whose levels follow increasing
# correlation (intended to order the x axis).
sorted_names <- names(sort(G3Por_cor))
factor_names <- factor(names(G3Por_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Por_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Portuguese Data",
  yaxis = list(title = "Correlation Coefficient")
)
# For Math Data
sorted_names <- names(sort(G3Math_cor))
factor_names <- factor(names(G3Math_cor), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = G3Math_cor, type = "bar"),
  title = "Correlation of G3 with Other Variables in Increasing Order for Math Data",
  yaxis = list(title = "Correlation Coefficient")
)
NA
# Reset the seed so the splits below are reproducible.
set.seed(seed = 123)
# Portuguese data: banner for the printed output.
cat("\n\n--------------For Portuguese Data--------------\n\n")
--------------For Portuguese Data--------------
# Split the Portuguese rows 80/20 with sample.split on G3, then fit the
# all-features linear model (now including Avg_alc) on the training rows.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(porData$G3, SplitRatio = 0.8)
train_data <- subset(porData, split == TRUE)
test_data <- subset(porData, split == FALSE)
model <- lm(G3 ~ ., data = train_data)
print(summary(model))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-12.8649 -1.4690 -0.0324 1.4546 7.4947
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.65632 2.42383 4.396 1.36e-05 ***
school -1.01607 0.29816 -3.408 0.00071 ***
sex -0.56464 0.28080 -2.011 0.04490 *
age 0.11966 0.11244 1.064 0.28779
address -0.36487 0.29514 -1.236 0.21696
famsize 0.37507 0.27216 1.378 0.16880
Pstatus 0.17681 0.38837 0.455 0.64913
Medu 0.01999 0.16840 0.119 0.90557
Fedu 0.15936 0.15437 1.032 0.30242
traveltime 0.07806 0.17798 0.439 0.66114
studytime 0.33635 0.15739 2.137 0.03310 *
failures -1.57897 0.22712 -6.952 1.18e-11 ***
schoolsup -1.16414 0.40707 -2.860 0.00442 **
famsup 0.11970 0.25228 0.474 0.63537
paid -0.45206 0.59270 -0.763 0.44600
activities 0.49781 0.25156 1.979 0.04840 *
nursery -0.31702 0.31062 -1.021 0.30796
higher 2.08859 0.43658 4.784 2.29e-06 ***
internet 0.14621 0.30492 0.480 0.63179
romantic -0.46852 0.25856 -1.812 0.07061 .
famrel 0.14097 0.12946 1.089 0.27673
freetime -0.11918 0.12477 -0.955 0.33997
goout 0.05859 0.12511 0.468 0.63980
health -0.14354 0.08634 -1.663 0.09705 .
absences -0.02791 0.02793 -0.999 0.31823
Mjobat_home -0.62030 0.55748 -1.113 0.26639
Mjobhealth 0.34049 0.59653 0.571 0.56842
Mjobother -0.58721 0.49221 -1.193 0.23346
Mjobservices -0.40723 0.48066 -0.847 0.39729
Fjobat_home -0.61661 0.73701 -0.837 0.40321
Fjobhealth -1.37558 0.83570 -1.646 0.10041
Fjobother -0.53498 0.58773 -0.910 0.36315
Fjobservices -0.89124 0.60031 -1.485 0.13830
reasoncourse -0.28587 0.33753 -0.847 0.39745
reasonhome -0.03253 0.36903 -0.088 0.92979
reasonother -0.58365 0.46542 -1.254 0.21044
guardianfather -0.60289 0.62063 -0.971 0.33183
guardianmother -0.87823 0.57084 -1.538 0.12458
Avg_alc -0.31080 0.14491 -2.145 0.03247 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.67 on 481 degrees of freedom
Multiple R-squared: 0.3705, Adjusted R-squared: 0.3208
F-statistic: 7.45 on 38 and 481 DF, p-value: < 2.2e-16
# RMSE of the Portuguese model on the rows currently held in test_data.
actual_values <- test_data[["G3"]]
predicted_values <- predict(object = model, newdata = test_data)
rmse_por <- rmse(actual_values, predicted_values)
cat("RMSE for Portuguese Data:", rmse_por, "\n")
RMSE for Portuguese Data: 2.74529
# Math data: banner separating the math results in the printed output.
cat("\n\n--------------For Math Data--------------\n\n")
--------------For Math Data--------------
# Split the math rows 80/20 with sample.split on G3, then fit the
# all-features linear model (now including Avg_alc) on the training rows.
# NOTE: split, train_data and test_data are reused here, so from this point
# on they hold the math split, not the Portuguese one.
# Assignments use `<-` (not `=`) to match the rest of the file.
split <- sample.split(mathData$G3, SplitRatio = 0.8)
train_data <- subset(mathData, split == TRUE)
test_data <- subset(mathData, split == FALSE)
modelMath <- lm(G3 ~ ., data = train_data)
print(summary(modelMath))
Call:
lm(formula = G3 ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-13.796 -1.941 0.337 2.661 7.693
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.00200 5.57812 3.765 0.000203 ***
school 1.35881 0.85920 1.581 0.114900
sex 1.10007 0.54988 2.001 0.046408 *
age -0.43299 0.24907 -1.738 0.083235 .
address -0.48185 0.63524 -0.759 0.448771
famsize 0.46514 0.53706 0.866 0.387187
Pstatus -0.30511 0.83407 -0.366 0.714791
Medu 0.32002 0.34856 0.918 0.359346
Fedu -0.04950 0.30526 -0.162 0.871291
traveltime -0.45310 0.38178 -1.187 0.236318
studytime 0.74948 0.31840 2.354 0.019272 *
failures -1.85267 0.39217 -4.724 3.67e-06 ***
schoolsup -1.70497 0.75404 -2.261 0.024522 *
famsup -1.35127 0.52409 -2.578 0.010441 *
paid 0.16646 0.52581 0.317 0.751796
activities -0.46764 0.49374 -0.947 0.344385
nursery -0.13691 0.62450 -0.219 0.826625
higher -0.96184 1.27907 -0.752 0.452695
internet 0.64538 0.67131 0.961 0.337197
romantic -1.07531 0.51116 -2.104 0.036301 *
famrel 0.19819 0.27040 0.733 0.464198
freetime 0.38454 0.26534 1.449 0.148395
goout -0.75222 0.24085 -3.123 0.001977 **
health -0.14243 0.18540 -0.768 0.442999
absences 0.08333 0.03504 2.378 0.018073 *
Mjobat_home 0.42393 1.12619 0.376 0.706888
Mjobhealth 2.43241 1.00075 2.431 0.015705 *
Mjobother 0.70353 0.90209 0.780 0.436116
Mjobservices 1.87902 0.84950 2.212 0.027783 *
Fjobat_home -1.86253 1.45802 -1.277 0.202510
Fjobhealth -1.92779 1.49048 -1.293 0.196943
Fjobother -2.92886 1.04075 -2.814 0.005238 **
Fjobservices -2.70820 1.07068 -2.529 0.011976 *
reasoncourse -0.67507 0.64686 -1.044 0.297570
reasonhome -0.46557 0.66436 -0.701 0.484019
reasonother 0.10717 0.91329 0.117 0.906672
guardianfather -1.07906 1.13886 -0.947 0.344212
guardianmother -1.03534 1.04159 -0.994 0.321085
Avg_alc 0.22662 0.30861 0.734 0.463367
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.065 on 279 degrees of freedom
Multiple R-squared: 0.3063, Adjusted R-squared: 0.2118
F-statistic: 3.241 on 38 and 279 DF, p-value: 1.114e-08
# Hold-out error for the math-course model: observed G3 in the math test
# split versus modelMath's predictions for the same rows.
actual_values_math <- test_data$G3
predicted_values_math <- predict(modelMath, newdata = test_data)
rmse_math <- rmse(actual_values_math, predicted_values_math)
cat("RMSE for Math Data:", rmse_math, "\n")
RMSE for Math Data: 4.619303
# Bar chart of the Portuguese model's coefficients (intercept included),
# ordered from most negative to most positive.
# A single signed sort is sufficient: the x-axis order comes from the factor
# levels, and the title promises increasing order, so a separate ordering by
# absolute value has no effect on the chart.
coefficients <- coef(model)
sorted_coefficients <- sort(coefficients)
sorted_names <- names(sorted_coefficients)
# Factor levels carry the intended left-to-right bar order.
factor_names <- factor(sorted_names, levels = sorted_names)
plot_ly(x = factor_names, y = sorted_coefficients, type = "bar") %>%
  layout(title = "Sorted Coefficients in Increasing Order for Portuguese Data",
         yaxis = list(title = "Coefficient Value"))
# Math model: bar chart of the fitted coefficients (intercept included).
# sorted_coefficientsMath is stored by decreasing magnitude, while the factor
# levels are built from the increasing signed order of the same values.
coefficientsMath <- coef(modelMath)
sorted_coefficientsMath <- coefficientsMath[
  order(abs(coefficientsMath), decreasing = TRUE)
]
sorted_names <- sorted_coefficientsMath %>% sort() %>% names()
factor_names <- factor(names(sorted_coefficientsMath), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = sorted_coefficientsMath, type = "bar"),
  title = "Sorted Coefficients in Increasing Order for Math Data",
  yaxis = list(title = "Coefficient Value")
)
NA
# Stack the math and Portuguese tables row-wise into one data set.
combinedData <- rbind(mathData, porData)
# View() opens an interactive data viewer, so it is only called in an
# interactive session; non-interactive runs (Rscript, knitting) skip it
# instead of failing.
if (interactive()) {
  View(combinedData)
}
# Correlation matrix over every column of combinedData (cor() defaults);
# all columns must be numeric for this call to succeed.
corValues <- cor(combinedData)
# Take the Avg_alc column from the matrix already computed above rather than
# running cor() on the whole data set a second time.
AlcAll_cor <- corValues[, "Avg_alc"]
# NOTE(review): this vector still contains Avg_alc's correlation with itself,
# although the title says "Other Variables" — confirm whether to drop it.
sorted_names <- names(sort(AlcAll_cor))
# Factor levels carry the increasing-correlation order for the x-axis.
factor_names <- factor(names(AlcAll_cor), levels = sorted_names)
plot_ly(x = factor_names, y = AlcAll_cor, type = "bar") %>%
  layout(title = "Correlation of Avg_alc with Other Variables in Increasing Order for Combined data",
         yaxis = list(title = "Correlation Coefficient"))
# Console banner marking the start of the combined-data results.
cat("\n\n", "--------------For Combined Data--------------", "\n\n", sep = "")
--------------For Combined Data--------------
# Partition the combined data with caTools::sample.split (SplitRatio = 0.8,
# keyed on Avg_alc) and fit a linear model of Avg_alc on all other columns.
# Note: split / train_data / test_data are overwritten here and now refer to
# the combined-data partition.
split <- sample.split(combinedData$Avg_alc, SplitRatio = 0.8)
train_data <- subset(combinedData, split)   # rows flagged TRUE by the mask
test_data <- subset(combinedData, !split)   # the complementary rows
modelAll <- lm(formula = Avg_alc ~ ., data = train_data)
print(summary(modelAll))
Call:
lm(formula = Avg_alc ~ ., data = train_data)
Residuals:
Min 1Q Median 3Q Max
-1.87298 -0.55788 -0.09884 0.42136 3.08504
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.212289 0.618233 -0.343 0.731403
school 0.019069 0.076548 0.249 0.803340
sex 0.575101 0.063773 9.018 < 2e-16 ***
age 0.067855 0.027271 2.488 0.013044 *
address 0.128984 0.071675 1.800 0.072310 .
famsize 0.136354 0.064166 2.125 0.033891 *
Pstatus 0.046166 0.097483 0.474 0.635930
Medu 0.017123 0.040828 0.419 0.675043
Fedu 0.025090 0.036432 0.689 0.491227
traveltime 0.055761 0.043399 1.285 0.199216
studytime -0.140734 0.036894 -3.815 0.000147 ***
failures 0.005874 0.050611 0.116 0.907632
schoolsup 0.120645 0.096340 1.252 0.210836
famsup 0.033660 0.061422 0.548 0.583833
paid 0.147789 0.072547 2.037 0.041966 *
activities -0.088996 0.058999 -1.508 0.131836
nursery -0.172589 0.071919 -2.400 0.016634 *
higher -0.071643 0.118568 -0.604 0.545859
internet 0.030993 0.075741 0.409 0.682501
romantic 0.072490 0.061485 1.179 0.238750
famrel -0.209515 0.031013 -6.756 2.74e-11 ***
freetime 0.011326 0.030188 0.375 0.707632
goout 0.318226 0.026359 12.073 < 2e-16 ***
health 0.059558 0.021262 2.801 0.005216 **
absences 0.017552 0.004816 3.645 0.000285 ***
G3 -0.002976 0.008139 -0.366 0.714719
Mjobat_home 0.026614 0.133563 0.199 0.842109
Mjobhealth -0.283918 0.134431 -2.112 0.034997 *
Mjobother -0.149291 0.112653 -1.325 0.185474
Mjobservices -0.101318 0.107234 -0.945 0.345032
Fjobat_home 0.135014 0.179394 0.753 0.451907
Fjobhealth 0.282642 0.188363 1.501 0.133875
Fjobother 0.332376 0.133422 2.491 0.012935 *
Fjobservices 0.529629 0.136397 3.883 0.000112 ***
reasoncourse -0.076977 0.078567 -0.980 0.327497
reasonhome -0.026409 0.084473 -0.313 0.754645
reasonother 0.214595 0.113489 1.891 0.059003 .
guardianfather 0.223536 0.143051 1.563 0.118536
guardianmother 0.073268 0.132245 0.554 0.579716
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8025 on 796 degrees of freedom
Multiple R-squared: 0.3773, Adjusted R-squared: 0.3476
F-statistic: 12.69 on 38 and 796 DF, p-value: < 2.2e-16
# Hold-out error for the combined-data alcohol model: observed Avg_alc in the
# test split versus modelAll's predictions for the same rows.
# The results get their own *_all names; writing them into the *_math
# variables would overwrite the math-course predictions and RMSE.
predicted_values_all <- predict(modelAll, newdata = test_data)
actual_values_all <- test_data$Avg_alc
rmse_all <- rmse(actual_values_all, predicted_values_all)
cat("RMSE for All Data for Alcohol Consumption:", rmse_all, "\n")
RMSE for All Data for Alcohol Consumption: 0.8816963
# Combined-data alcohol model: bar chart of the fitted coefficients
# (intercept included). sorted_coefficients is stored by decreasing
# magnitude, while the factor levels are built from the increasing signed
# order of the same values.
coefficients <- coef(modelAll)
sorted_coefficients <- coefficients[
  order(abs(coefficients), decreasing = TRUE)
]
sorted_names <- sorted_coefficients %>% sort() %>% names()
factor_names <- factor(names(sorted_coefficients), levels = sorted_names)
layout(
  plot_ly(x = factor_names, y = sorted_coefficients, type = "bar"),
  title = "Sorted Coefficients in Increasing Order for Alcohol Consumption for combined Data ",
  yaxis = list(title = "Coefficient Value")
)